/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.tools;
import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
import java.util.logging.*;
import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.util.*;
import net.nutch.fetcher.*;
import net.nutch.indexer.*;
/** A tool that performs a complete crawl and index in one step: it creates a
 * web database, injects a set of root URLs, then repeatedly generates,
 * fetches, and updates segments to the requested depth, and finally indexes,
 * deduplicates, and merges the results. */
public class CrawlTool {
public static final Logger LOG =
LogFormatter.getLogger("net.nutch.tools.CrawlTool");
static {
NutchConf.addConfResource("crawl-tool.xml");
}
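
  /* Example invocation (file and directory names here are illustrative):
   *
   *   java net.nutch.tools.CrawlTool urls.txt -dir crawl-test -depth 3 -threads 4
   */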
  /** Returns a string representing the current date and time (for example,
   * "20031122093045") that also sorts lexicographically by date. */
  private static String getDate() {
    return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
  }
  /** Returns the pathname of the latest segment in a segments directory.
   * Segment names are timestamps (see {@link #getDate()}), so sorting them
   * lexicographically puts the most recent segment last. */
private static String getLatestSegment(String segmentsDir) {
String[] allSegments = new File(segmentsDir).list();
Arrays.sort(allSegments);
return segmentsDir + "/" + allSegments[allSegments.length-1];
}
  /** Performs a complete crawl and index, given a file of root URLs. */
  public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.out.println
("Usage: CrawlTool <root_url_file> [-dir d] [-threads n] [-depth i] [-delay s] [-showThreadID]");
return;
}
String rootUrlFile = args[0];
String dir = "crawl-" + getDate();
int threads = NutchConf.getInt("fetcher.threads.fetch", 10);
int serverDelay = NutchConf.getInt("fetcher.server.delay", 1);
int depth = 5;
boolean showThreadID = false;
for (int i = 1; i < args.length; i++) {
if ("-dir".equals(args[i])) {
dir = args[i+1];
i++;
} else if ("-threads".equals(args[i])) {
threads = Integer.parseInt(args[i+1]);
i++;
} else if ("-depth".equals(args[i])) {
depth = Integer.parseInt(args[i+1]);
i++;
} else if ("-delay".equals(args[i])) {
serverDelay = Integer.parseInt(args[i+1]);
i++;
} else if ("-showThreadID".equals(args[i])) {
showThreadID = true;
}
}
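
    // The original code parses -showThreadID but never acts on it; wiring it
    // to the logger here is an assumption, based on the
    // setShowThreadIDs(boolean) hook in net.nutch.util.LogFormatter.
    if (showThreadID)
      LogFormatter.setShowThreadIDs(showThreadID);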
if (new File(dir).exists())
throw new RuntimeException(dir + " already exists.");
LOG.info("crawl started in: " + dir);
LOG.info("rootUrlFile = " + rootUrlFile);
LOG.info("threads = " + threads);
LOG.info("depth = " + depth);
LOG.info("serverDelay = " + serverDelay);
String db = dir + "/db";
String segments = dir + "/segments";
// initialize the web database
WebDBAdminTool.main(new String[] { db, "-create" } );
// inject the root urls into the database
WebDBInjector.main(new String[] { db, "-urlfile", rootUrlFile } );
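    // The root URL file is expected to be plain text, one URL per line, e.g.:
    //   http://www.nutch.org/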
for (int i = 0; i < depth; i++) {
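      // Each pass expands the crawl by one link hop: pages fetched in this
      // round contribute their outlinks to the next round's fetchlist.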
// generate a new segment
FetchListTool.main(new String[] { db, segments } );
String segment = getLatestSegment(segments);
// fetch the new segment
Fetcher.main(new String[] { "-threads", ""+threads,
"-delay", ""+serverDelay,
segment } );
// update the database
UpdateDatabaseTool.main(new String[] { db, segment } );
}
    // Re-fetch everything to get the complete set of incoming anchor texts
    // associated with each page. Ideally we would instead update the
    // previously fetched segments with the anchors now in the database, but
    // until that algorithm is written, we re-fetch.
// delete all the old segment data
FileUtil.fullyDelete(new File(segments));
    // generate a single segment containing all pages in the db; -adddays
    // shifts the notion of "now" far forward so that every page is
    // considered due for fetching
    FetchListTool.main(new String[] { db, segments,
                                      "-adddays", ""+Integer.MAX_VALUE } );
String segment = getLatestSegment(segments);
// re-fetch everything
Fetcher.main(new String[] { "-threads", ""+threads,
"-delay", ""+serverDelay,
segment } );
// index, dedup & merge
IndexSegment.main(new String[] { segment } );
DeleteDuplicates.main(new String[] { segments, dir + "/dedup" } );
IndexMerger.main(new String[] { dir + "/index", segment } );
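
    // The finished crawl directory now contains:
    //   db/       the web database
    //   segments/ the final, re-fetched segment
    //   dedup/    working files from duplicate deletion
    //   index/    the merged index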
LOG.info("crawl finished: " + dir);
}
}